In [7]:
%matplotlib inline

from math import pi
import pandas
import collections
import matplotlib.pyplot as plt
import numpy as np
import datetime
import matplotlib.dates as mdates
from bokeh.plotting import figure, output_notebook, show
from bokeh import *
from bokeh.io import reset_output,output_file
from bokeh.models import CustomJS, DatetimeTickFormatter
from bokeh.models import FactorRange,ColumnDataSource
from bokeh.models.widgets import Dropdown,Select
from bokeh.layouts import column,row, widgetbox
import holoviews as hv
from bokeh.core.properties import value
from bokeh.palettes import Category20c
from bokeh.transform import cumsum
from bokeh.models.widgets import Panel, Tabs

# Reset Bokeh's output configuration before any plotting is set up
# (presumably to discard output state left over from earlier runs).
reset_output()
# Select Bokeh as the plotting backend for HoloViews.
hv.extension('bokeh')

# Notebook output is left disabled here; individual cells call
# output_notebook() themselves before showing a figure.
#output_notebook()

# Load the Query 1 result set into the module-level name `data`.
# Later cells rebind `data` to other results.
data=pandas.read_csv("query1.csv")
def loadData(path):
    """Read the CSV file at ``path`` and return its contents as a pandas DataFrame."""
    return pandas.read_csv(path)

Query 1

the percentage of canceled flights per day, throughout the entire data set

In [8]:
reset_output()
output_notebook()

# Load the Query 1 result inside this cell, under its own name. Later cells
# rebind the module-level `data` to other objects, so relying on it would make
# this cell fail when it is re-run after them.
cancelled_df = loadData("query1.csv")

# One calendar date per row, assembled from the Year / Month / Day columns.
histD = [
    datetime.date(rec.Year, rec.Month, rec.Day)
    for rec in cancelled_df.itertuples()
]

percentage = cancelled_df['PercCancelled']

p = figure(
    plot_width=800,
    plot_height=400,
    x_axis_type="datetime",
    title="Percentage cancelled flights per day",
)
p.line(histD, percentage)
# Use the same "day month year" tick label for every tick scale listed here.
p.xaxis.formatter = DatetimeTickFormatter(
    hours=["%d %B %Y"],
    days=["%d %B %Y"],
    months=["%d %B %Y"],
    years=["%d %B %Y"],
)
t = show(row(p), notebook_handle=True)
reset_output()
Loading BokehJS ...

Query 2

weekly percentages of delays that are due to weather, throughout the entire data set

In [9]:
data = loadData("query2.csv")

output_notebook()
N = 2003

# Collect (year, week number, fraction) triples from the "YearWeek" and
# "PercDelaysPerWeek" columns. The week is converted to int so that the sort
# below orders weeks numerically within each year.
ymOrdDict = []
for rec in data.itertuples():
    year_part, week_part = rec.YearWeek.split('-')
    ymOrdDict.append((year_part, int(week_part), rec.PercDelaysPerWeek))
ymOrdDict.sort(key=lambda triple: (triple[0], triple[1]))

# Nested (year, week) factors for the categorical x axis, and the matching
# bar heights scaled by 100.
ymDict = [(year_part, str(week_no)) for year_part, week_no, fraction in ymOrdDict]
percDict = [fraction * 100 for year_part, week_no, fraction in ymOrdDict]

source = ColumnDataSource(data=dict(x=ymDict, counts=percDict))
p1 = figure(
    x_range=FactorRange(*ymDict),
    plot_width=3500,
    plot_height=400,
    title="Weekly delay of flights due to weather",
    tools="hover",
    tooltips="year-week: @x; perc: @counts % ",
)
p1.vbar(x='x', top='counts', width=0.5, alpha=0.5, source=source)

# Axis cosmetics: bars start at zero, x labels are rotated, no vertical grid.
p1.y_range.start = 0
p1.x_range.range_padding = 0.1
p1.yaxis.axis_label = 'Weekly Percentage'
p1.xaxis.major_label_orientation = pi/3
p1.xgrid.grid_line_color = None

k = show(row(p1), notebook_handle=True)
reset_output()
Loading BokehJS ...

Query 3

the percentage of flights belonging to a given "distance group" that were able to halve their departure delays by the time they arrived at their destinations. Distance groups assort flights by their total distance in miles. Flights with distances that are less than 200 miles belong in group 1, flights with distances that are between 200 and 399 miles belong in group 2, flights with distances that are between 400 and 599 miles belong in group 3, and so on. The last group contains flights whose distances are between 2400 and 2599 miles

In [10]:
datac = loadData("query3.csv")
reset_output()
output_notebook()

# Names of the two stacked series and their colours. The list keeps its
# original name `years`, although it holds the stack series names.
years = ["halved", "notHalved"]
colors = ["#c9d9d3", "#718dbf"]

# One tuple per distance group:
# (group number, halved count, not-halved count, total flights, fraction halved)
groupList = []
for rec in datac.itertuples():
    halved = int(rec.HalvedPerDistGroup)
    total = int(rec.FlightsPerDistGroup)
    groupList.append((int(rec.DistGroup), halved, total - halved, total, rec.PercHalved))
ordgroupList = sorted(groupList, key=lambda grp: grp[0])

# Derive the x-axis labels from the data instead of hard-coding '1'..'13', so
# every bar is labelled with its own group number even if a group is missing
# from (or added to) the CSV, and all columns below have the same length.
groups = [str(grp[0]) for grp in ordgroupList]
halvedList = [grp[1] for grp in ordgroupList]
nonHalvedList = [grp[2] for grp in ordgroupList]
totalList = [grp[3] for grp in ordgroupList]
percList = [grp[4] * 100 for grp in ordgroupList]

data = {'groups': groups,
        'halved': halvedList,
        'notHalved': nonHalvedList,
        'perc': percList,
        'total': totalList}

p2 = figure(x_range=groups, plot_width=650, plot_height=550, title="Halved Flight Graph",
            tools="hover",
            tooltips="$name:  @$name; "
                     "total: @total; "
                     "perc of halved: @perc%")
p2.vbar_stack(years, x='groups', width=0.9, color=colors, source=data,
              legend=[value(x) for x in years])
p2.xaxis.axis_label = 'Distance Group'
p2.yaxis.axis_label = 'Total amount of flights'


k = show(p2, notebook_handle=True)
Loading BokehJS ...

Query 4

a weekly "penalty" score for each airport that depends on both its incoming and outgoing flights. The score adds 0.5 for each incoming flight that is more than 15 minutes late, and 1 for each outgoing flight that is more than 15 minutes late.

In [6]:
%%opts Overlay [width=700 height=700 title_format="Distribution of flights for each distance group" xaxis=None yaxis=None]
%%opts Bars {+framewise}
%%opts Bars [width=600 height=2500 show_legend=False tools=['hover']]
%output max_frames=1000

%%opts Bars [invert_axes=True]
hv.extension('bokeh')

#output_file('myplot10.html')
#output_notebook()
datac=pandas.read_csv("query4.csv")
ymOrdDict=[]
menu=[]
#hv.renderer('bokeh')
#renderer = hv.plotting.mpl.MPLRenderer.instance(dpi=120)
yw=""
for x in datac.itertuples():
    ab=(getattr(x,"YearWeek"))
    airport=(getattr(x,"Airport"))
    incoming=(getattr(x,"IncomingDelaysPerAirport"))
    outgoing=(getattr(x,"OutgoingDelaysPerAirport"))
    penalty=(getattr(x,"Penalty"))
    a,b=ab.split('-')
    temp=(int(a),int(b),airport,incoming,outgoing,int(penalty))
    ymOrdDict.append(temp)
    f=list(filter(lambda x: airport in x, menu))
    if (len(f)==0):
        menu.append(airport)

#print(ymOrdDict[:1000])
ymOrdDict=sorted(ymOrdDict, key = lambda x: (x[0], x[1])) 
df = pandas.DataFrame.from_records(ymOrdDict, columns=['year','week','airport','incoming','outgoing','penalty'])
keys=[]
for x in df.itertuples():
    one=(getattr(x,"year"))
    one=(getattr(x,"week"))
kdims=[('airport','Airport'),('week','Week'),('year','Year')]
vdims = [('penalty', 'Penalty')]
macro = hv.Dataset(df,kdims,vdims,label='Weekly penalty per Airport, given:')


curve = macro.to(hv.Bars, 'Airport', 'Penalty')
#renderer = hv.plotting.mpl.MPLRenderer.instance(dpi=120)

curve
Out[6]: